title: “Beers_Analysis” author: “Sachin Chavan” date: “10/13/2019” output: html_document —
breweries_ds <- read.csv('data/Breweries.csv',header = TRUE)
beers_ds <- read.csv('data/Beers.csv' ,header = TRUE)
bmerged_ds <- merge(breweries_ds,beers_ds,by.x="Brew_ID",by.y="Brewery_id")
names(bmerged_ds)[2] <- "BreweryName"
names(bmerged_ds)[5] <- "BeerName"
str(breweries_ds)
## 'data.frame': 558 obs. of 4 variables:
## $ Brew_ID: int 1 2 3 4 5 6 7 8 9 10 ...
## $ Name : Factor w/ 551 levels "10 Barrel Brewing Company",..: 355 12 266 319 201 136 227 477 59 491 ...
## $ City : Factor w/ 384 levels "Abingdon","Abita Springs",..: 228 200 122 299 300 62 91 48 152 136 ...
## $ State : Factor w/ 51 levels " AK"," AL"," AR",..: 24 18 20 5 5 41 6 23 23 23 ...
str(beers_ds)
## 'data.frame': 2410 obs. of 7 variables:
## $ Name : Factor w/ 2305 levels "#001 Golden Amber Lager",..: 1638 577 1704 1842 1819 268 1160 758 1093 486 ...
## $ Beer_ID : int 1436 2265 2264 2263 2262 2261 2260 2259 2258 2131 ...
## $ ABV : num 0.05 0.066 0.071 0.09 0.075 0.077 0.045 0.065 0.055 0.086 ...
## $ IBU : int NA NA NA NA NA NA NA NA NA NA ...
## $ Brewery_id: int 409 178 178 178 178 178 178 178 178 178 ...
## $ Style : Factor w/ 100 levels "","Abbey Single Ale",..: 19 18 16 12 16 80 18 22 18 12 ...
## $ Ounces : num 12 12 12 12 12 12 12 12 12 12 ...
str(bmerged_ds)
## 'data.frame': 2410 obs. of 10 variables:
## $ Brew_ID : int 1 1 1 1 1 1 2 2 2 2 ...
## $ BreweryName: Factor w/ 551 levels "10 Barrel Brewing Company",..: 355 355 355 355 355 355 12 12 12 12 ...
## $ City : Factor w/ 384 levels "Abingdon","Abita Springs",..: 228 228 228 228 228 228 200 200 200 200 ...
## $ State : Factor w/ 51 levels " AK"," AL"," AR",..: 24 24 24 24 24 24 18 18 18 18 ...
## $ BeerName : Factor w/ 2305 levels "#001 Golden Amber Lager",..: 1640 1926 1525 802 1258 2185 71 458 1218 43 ...
## $ Beer_ID : int 2689 2688 2687 2692 2691 2690 2683 2686 2685 2684 ...
## $ ABV : num 0.06 0.06 0.056 0.045 0.049 0.048 0.042 0.08 0.125 0.077 ...
## $ IBU : int 38 25 47 50 26 19 42 68 80 25 ...
## $ Style : Factor w/ 100 levels "","Abbey Single Ale",..: 83 22 57 16 77 48 18 12 46 77 ...
## $ Ounces : num 16 16 16 16 16 16 16 16 16 16 ...
head(bmerged_ds)
## Brew_ID BreweryName City State BeerName Beer_ID ABV
## 1 1 NorthGate Brewing Minneapolis MN Pumpion 2689 0.060
## 2 1 NorthGate Brewing Minneapolis MN Stronghold 2688 0.060
## 3 1 NorthGate Brewing Minneapolis MN Parapet ESB 2687 0.056
## 4 1 NorthGate Brewing Minneapolis MN Get Together 2692 0.045
## 5 1 NorthGate Brewing Minneapolis MN Maggie's Leap 2691 0.049
## 6 1 NorthGate Brewing Minneapolis MN Wall's End 2690 0.048
## IBU Style Ounces
## 1 38 Pumpkin Ale 16
## 2 25 American Porter 16
## 3 47 Extra Special / Strong Bitter (ESB) 16
## 4 50 American IPA 16
## 5 26 Milk / Sweet Stout 16
## 6 19 English Brown Ale 16
tail(bmerged_ds)
## Brew_ID BreweryName City State
## 2405 556 Ukiah Brewing Company Ukiah CA
## 2406 557 Butternuts Beer and Ale Garrattsville NY
## 2407 557 Butternuts Beer and Ale Garrattsville NY
## 2408 557 Butternuts Beer and Ale Garrattsville NY
## 2409 557 Butternuts Beer and Ale Garrattsville NY
## 2410 558 Sleeping Lady Brewing Company Anchorage AK
## BeerName Beer_ID ABV IBU Style
## 2405 Pilsner Ukiah 98 0.055 NA German Pilsener
## 2406 Porkslap Pale Ale 49 0.043 NA American Pale Ale (APA)
## 2407 Snapperhead IPA 51 0.068 NA American IPA
## 2408 Moo Thunder Stout 50 0.049 NA Milk / Sweet Stout
## 2409 Heinnieweisse Weissebier 52 0.049 NA Hefeweizen
## 2410 Urban Wilderness Pale Ale 30 0.049 NA English Pale Ale
## Ounces
## 2405 12
## 2406 12
## 2407 12
## 2408 12
## 2409 12
## 2410 12
summary(breweries_ds)
## Brew_ID Name City
## Min. : 1.0 Blackrocks Brewery : 2 Portland: 17
## 1st Qu.:140.2 Blue Mountain Brewery : 2 Boulder : 9
## Median :279.5 Lucette Brewing Company: 2 Chicago : 9
## Mean :279.5 Oskar Blues Brewery : 2 Seattle : 9
## 3rd Qu.:418.8 Otter Creek Brewing : 2 Austin : 8
## Max. :558.0 Sly Fox Brewing Company: 2 Denver : 8
## (Other) :546 (Other) :498
## State
## CO : 47
## CA : 39
## MI : 32
## OR : 29
## TX : 28
## PA : 25
## (Other):358
summary(beers_ds)
## Name Beer_ID ABV
## Nonstop Hef Hop : 12 Min. : 1.0 Min. :0.00100
## Dale's Pale Ale : 6 1st Qu.: 808.2 1st Qu.:0.05000
## Oktoberfest : 6 Median :1453.5 Median :0.05600
## Longboard Island Lager: 4 Mean :1431.1 Mean :0.05977
## 1327 Pod's ESB : 3 3rd Qu.:2075.8 3rd Qu.:0.06700
## Boston Lager : 3 Max. :2692.0 Max. :0.12800
## (Other) :2376 NA's :62
## IBU Brewery_id Style
## Min. : 4.00 Min. : 1.0 American IPA : 424
## 1st Qu.: 21.00 1st Qu.: 94.0 American Pale Ale (APA) : 245
## Median : 35.00 Median :206.0 American Amber / Red Ale : 133
## Mean : 42.71 Mean :232.7 American Blonde Ale : 108
## 3rd Qu.: 64.00 3rd Qu.:367.0 American Double / Imperial IPA: 105
## Max. :138.00 Max. :558.0 American Pale Wheat Ale : 97
## NA's :1005 (Other) :1298
## Ounces
## Min. : 8.40
## 1st Qu.:12.00
## Median :12.00
## Mean :13.59
## 3rd Qu.:16.00
## Max. :32.00
##
summary(bmerged_ds)
## Brew_ID BreweryName City
## Min. : 1.0 Brewery Vivant : 62 Grand Rapids: 66
## 1st Qu.: 94.0 Oskar Blues Brewery : 46 Portland : 64
## Median :206.0 Sun King Brewing Company : 38 Chicago : 55
## Mean :232.7 Cigar City Brewing Company: 25 Indianapolis: 43
## 3rd Qu.:367.0 Sixpoint Craft Ales : 24 San Diego : 42
## Max. :558.0 Hopworks Urban Brewery : 23 Boulder : 41
## (Other) :2192 (Other) :2099
## State BeerName Beer_ID
## CO : 265 Nonstop Hef Hop : 12 Min. : 1.0
## CA : 183 Dale's Pale Ale : 6 1st Qu.: 808.2
## MI : 162 Oktoberfest : 6 Median :1453.5
## IN : 139 Longboard Island Lager: 4 Mean :1431.1
## TX : 130 1327 Pod's ESB : 3 3rd Qu.:2075.8
## OR : 125 Boston Lager : 3 Max. :2692.0
## (Other):1406 (Other) :2376
## ABV IBU Style
## Min. :0.00100 Min. : 4.00 American IPA : 424
## 1st Qu.:0.05000 1st Qu.: 21.00 American Pale Ale (APA) : 245
## Median :0.05600 Median : 35.00 American Amber / Red Ale : 133
## Mean :0.05977 Mean : 42.71 American Blonde Ale : 108
## 3rd Qu.:0.06700 3rd Qu.: 64.00 American Double / Imperial IPA: 105
## Max. :0.12800 Max. :138.00 American Pale Wheat Ale : 97
## NA's :62 NA's :1005 (Other) :1298
## Ounces
## Min. : 8.40
## 1st Qu.:12.00
## Median :12.00
## Mean :13.59
## 3rd Qu.:16.00
## Max. :32.00
##
b_states <- breweries_ds %>% group_by(State) %>% count() %>% arrange(desc(n))
b_states$State <- factor(b_states$State, levels = b_states$State)
ggplot(b_states, aes(x=State, y=n)) +
geom_bar(stat="identity", width=.8, fill="tomato3") +
labs(title="Ordered Bar Chart",
subtitle="State Vs Number of Breweries",
caption="Beer Analysis") +
xlab("State")+
ylab("Total Breweries")+
theme_wsj()+
theme(axis.text.x = element_text(angle=65, vjust=0.6))+
theme(plot.title = element_text(size = rel(0.5)),
plot.subtitle = element_text(size = rel(0.5)),
axis.text.x = element_text(vjust=0.6,size=10),
axis.title = element_text(size = rel(0.5)),
legend.position = "right",
legend.direction ="vertical",
legend.title = element_text(size = rel(0.2)))
vis_dat(bmerged_ds,palette = "qual")
beer_ounces <- beers_ds
beer_ounces$Ounces <- as.factor(beer_ounces$Ounces)
gg_miss_var(bmerged_ds,facet=State)+
theme_wsj()+
theme(axis.text.x = element_text(angle=65, vjust=0.3))+
theme(plot.title = element_text(size = rel(0.5)),
plot.subtitle = element_text(size = rel(0.5)),
axis.text.x = element_text(vjust=0.6,size=8),
axis.text.y = element_text(vjust=0.2,size=4),
axis.title = element_text(size = rel(0.5)),
legend.position = "right",
legend.direction ="vertical",
legend.title = element_text(size = rel(0.2)))
gg_miss_var(bmerged_ds,facet=Style)+
theme_wsj()+
theme(axis.text.x = element_text(angle=65, vjust=0.3))+
theme(plot.title = element_text(size = rel(0.5)),
plot.subtitle = element_text(size = rel(0.3)),
axis.text.x = element_text(vjust=0.6,size=8),
axis.text.y = element_text(vjust=0.2,size=4),
axis.title = element_text(size = rel(0.5)),
legend.position = "right",
legend.direction ="vertical",
legend.title = element_text(size = rel(0.2)))
gg_miss_var(beer_ounces,facet=Ounces)+
theme_wsj()+
theme(axis.text.x = element_text(angle=65, vjust=0.3))+
theme(plot.title = element_text(size = rel(0.5)),
plot.subtitle = element_text(size = rel(0.5)),
axis.text.x = element_text(vjust=0.6,size=10),
axis.text.y = element_text(vjust=0.2,size=7),
axis.title = element_text(size = rel(0.5)),
legend.position = "right",
legend.direction ="vertical",
legend.title = element_text(size = rel(0.2)))
### Missing Values continued * Missing values by state * Missing values by Style
aq_shadow <- bind_shadow(bmerged_ds)
glimpse(aq_shadow)
## Observations: 2,410
## Variables: 20
## $ Brew_ID <int> 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2…
## $ BreweryName <fct> NorthGate Brewing , NorthGate Brewing , NorthGate…
## $ City <fct> Minneapolis, Minneapolis, Minneapolis, Minneapoli…
## $ State <fct> MN, MN, MN, MN, MN, MN, KY, KY, KY, KY,…
## $ BeerName <fct> Pumpion, Stronghold, Parapet ESB, Get Together, M…
## $ Beer_ID <int> 2689, 2688, 2687, 2692, 2691, 2690, 2683, 2686, 2…
## $ ABV <dbl> 0.060, 0.060, 0.056, 0.045, 0.049, 0.048, 0.042, …
## $ IBU <int> 38, 25, 47, 50, 26, 19, 42, 68, 80, 25, 17, 25, 2…
## $ Style <fct> Pumpkin Ale, American Porter, Extra Special / Str…
## $ Ounces <dbl> 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 1…
## $ Brew_ID_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ BreweryName_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ City_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ State_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ BeerName_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ Beer_ID_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ ABV_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ IBU_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ Style_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ Ounces_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
bmerged_ds %>%
bind_shadow() %>%
group_by(IBU_NA) %>%
summarise_at(.vars = "ABV",
.funs = c("mean", "sd", "var", "min", "max"),
na.rm = TRUE)
## # A tibble: 2 x 6
## IBU_NA mean sd var min max
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 !NA 0.0599 0.0136 0.000184 0.027 0.125
## 2 NA 0.0596 0.0135 0.000182 0.001 0.128
ggplot(aq_shadow,
aes(x = ABV,
colour = IBU_NA)) +
geom_density()
## Warning: Removed 62 rows containing non-finite values (stat_density).
ggplot(bmerged_ds,
aes(x = ABV,
y = IBU)) +
geom_miss_point() +
theme_wsj()+
theme(axis.text.x = element_text(angle=65, vjust=0.3))+
theme(plot.title = element_text(size = rel(0.5)),
plot.subtitle = element_text(size = rel(0.5)),
axis.text.x = element_text(vjust=0.6,size=8),
axis.text.y = element_text(vjust=0.2,size=8),
axis.title = element_text(size = rel(0.5)),
legend.position = "right",
legend.direction ="vertical",
legend.title = element_text(size = rel(0.5)))+
facet_wrap(~State)
ggplot(bmerged_ds,
aes(x = ABV,
y = IBU)) +
geom_miss_point() +
theme_wsj()+
theme(axis.text.x = element_text(angle=65, vjust=0.3))+
theme(plot.title = element_text(size = rel(0.3)),
plot.subtitle = element_text(size = rel(0.5)),
axis.text.x = element_text(vjust=0.6,size=8),
axis.text.y = element_text(vjust=0.2,size=6),
axis.title = element_text(size = rel(0.5)),
legend.position = "right",
legend.direction ="vertical",
legend.title = element_text(size = rel(0.5)))+
facet_wrap(~Style)
bmerged_final_ds <- bmerged_ds %>% filter(!is.na(ABV) | !is.na(IBU))
non_missing_ibu <- bmerged_final_ds %>% filter(!is.na(IBU))
non_missing_ibu$imputed <- "N"
missing_ibu <- bmerged_final_ds %>% filter(is.na(IBU))
missing_ibu$imputed <- "Y"
model_ibu <- non_missing_ibu %>% lm(formula=log(IBU)~log(ABV))
plot(model_ibu)
summary(model_ibu)
##
## Call:
## lm(formula = log(IBU) ~ log(ABV), data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.81374 -0.30156 0.08312 0.38514 1.35081
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.99667 0.18115 49.66 <2e-16 ***
## log(ABV) 1.91683 0.06363 30.12 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.514 on 1403 degrees of freedom
## Multiple R-squared: 0.3927, Adjusted R-squared: 0.3923
## F-statistic: 907.4 on 1 and 1403 DF, p-value: < 2.2e-16
missing_ibu$IBU <- round(exp(predict(model_ibu,missing_ibu)),0)
imputed_df <- rbind(non_missing_ibu,missing_ibu)
imputed_df$imputed <- as.factor(imputed_df$imputed)
imputed_df %>% ggplot(aes(x=ABV,y=IBU,color=imputed))+
geom_point()+ggtitle("ABV Vs IBU") +
theme_wsj()+
theme(axis.text.x = element_text(angle=65, vjust=0.3))+
theme(plot.title = element_text(size = rel(0.5)),
plot.subtitle = element_text(size = rel(0.5)),
axis.text.x = element_text(vjust=0.6,size=8),
axis.text.y = element_text(vjust=0.2,size=6),
axis.title = element_text(size = rel(0.5)),
legend.position = "right",
legend.direction ="vertical",
legend.title = element_text(size = rel(0.5)))
abv_by_states <- imputed_df %>% select(State,ABV) %>% group_by(State) %>% summarise(Median=as.numeric(median(ABV))) %>% arrange(desc(Median))
abv_by_states$State <- factor(abv_by_states$State, levels = abv_by_states$State)
ggplot(abv_by_states, aes(x=State, y=Median)) +
geom_bar(stat="identity", width=.8, fill="tomato3") +
labs(title="Ordered Bar Chart",
subtitle="Median ABV Vs State",
caption="Beer Analysis") +
xlab("State")+
ylab("Alcohol content")+
theme_wsj()+
theme(axis.text.x = element_text(angle=65, vjust=0.6))+
theme(plot.title = element_text(size = rel(0.5)),
plot.subtitle = element_text(size = rel(0.5)),
axis.text.x = element_text(vjust=0.6,size=7),
axis.title = element_text(size = rel(0.5)),
legend.position = "right",
legend.direction ="vertical",
legend.title = element_text(size = rel(0.2)))
ibu_by_states <- imputed_df %>% select(State,IBU) %>% group_by(State) %>% summarise(Median=as.numeric(median(IBU))) %>% arrange(desc(Median))
ibu_by_states$State <- factor(ibu_by_states$State, levels = ibu_by_states$State)
ggplot(ibu_by_states, aes(x=State, y=Median)) +
geom_bar(stat="identity", width=.8, fill="tomato3") +
labs(title="Ordered Bar Chart",
subtitle="Median IBU Vs State",
caption="Beer Analysis") +
xlab("State")+
ylab("Alcohol content")+
theme_wsj()+
theme(axis.text.x = element_text(angle=65, vjust=0.6))+
theme(plot.title = element_text(size = rel(0.5)),
plot.subtitle = element_text(size = rel(0.5)),
axis.text.x = element_text(vjust=0.6,size=7),
axis.title = element_text(size = rel(0.5)),
legend.position = "right",
legend.direction ="vertical",
legend.title = element_text(size = rel(0.2)))
imputed_df %>% ggplot(aes(y=ABV,x=State)) +
geom_boxplot(aes(fill=State), alpha=0.8,show.legend = FALSE) +
labs(title="Alcohol Content",
caption="Beer Analysis") +
xlab("State")+
ylab("Alcohol content")+
theme_wsj()+
theme(axis.text.x = element_text(angle=65, vjust=0.6))+
theme(plot.title = element_text(size = rel(0.5)),
plot.subtitle = element_text(size = rel(0.5)),
axis.text.x = element_text(vjust=0.6,size=7),
axis.title = element_text(size = rel(0.5)),
legend.position = "right",
legend.direction ="vertical",
legend.title = element_text(size = rel(0.2)))
imputed_df %>% ggplot(aes(x=ABV)) +
geom_density(aes(fill=State), alpha=0.8) +
labs(title="Density Plot",
caption="Beer Analysis") +
xlab("State")+
ylab("Alcohol content")+
theme_wsj()+
theme(axis.text.x = element_text(angle=65, vjust=0.6))+
theme(plot.title = element_text(size = rel(0.5)),
plot.subtitle = element_text(size = rel(0.5)),
axis.text.x = element_text(vjust=0.6,size=7),
axis.title = element_text(size = rel(0.5)),
legend.position = "right",
legend.direction ="vertical",
legend.title = element_text(size = rel(0.2)))
## Warning: Groups with fewer than two data points have been dropped.